/**   
 * Filename:    Fetcher.java   
 * Created at:  2016-03-28 15:07:14
 * Description:  
 * Modification History:   
 * Author      Version     Description   
 * ----------------------------------------------------------------- 
 * Garfield      1.0       1.0 Version   
 */
package pga2;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Method;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import pga2.tools.StringUtils;
import pga2.tools.TimeSec;
import pga2.tools.UrlFormat;
import pga2.tools.CreateFile;

/**
 * Command-line style entry points of the crawler, built on jsoup.
 *
 * <p>Three modes are offered, each driven by an argument array whose first
 * element names the mode:
 * <ul>
 * <li>{@link #RunCralerCw(String[])} ({@code -cw}): fetch one page and print the
 * extracted fields to standard output;</li>
 * <li>{@link #CrawlerCI(String[])} ({@code -ci}): read a list of URLs from a file
 * and write the extracted fields of every page to an output file;</li>
 * <li>{@link #CrawlerCL(String, String, String)}: collect the links found in a
 * region of a page.</li>
 * </ul>
 *
 * <p>Options understood by the {@code -cw}/{@code -ci} modes:
 * {@code -header name@value}, {@code -cookie value}, {@code -data k=v&k2=v2},
 * {@code -proxy host:port}, {@code -timesec n}, {@code -cq rules},
 * {@code -file path}, {@code -input path} and the value-less {@code -post}.
 * Extraction rules have the form {@code key,selector} and several rules are
 * joined with {@code #}.
 */
public class Fetcher {

	/** User-Agent sent when the caller supplies no {@code -header} option at all. */
	private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36";

	/** Request timeout, in milliseconds, used by the {@code -cw} and {@code -ci} modes. */
	private static final int PAGE_TIMEOUT_MILLIS = 50000;

	/** Request timeout, in milliseconds, used by the link-list crawler. */
	private static final int LIST_TIMEOUT_MILLIS = 10000;

	/** Values collected while walking an argument array. */
	private static final class CrawlOptions {
		/** True when the request must be sent with POST. */
		boolean post;
		/** True once any {@code -header} option has been applied. */
		boolean headerSupplied;
		/** Raw value of {@code -cq}. */
		String rules = "";
		/** Raw value of {@code -file} (the URL list). */
		String urlListFile = "";
		/** Raw value of {@code -input} (the output file). */
		String outputFile = "";
	}

	/**
	 * Demo entry point: runs the {@code -ci} crawler with a hard-coded argument
	 * string and prints the returned status message.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		// CrawlerCL("", "http://www.oistc.com/pocdos/", "div[class=listc] h3");
		String canshu = "-ci@@-file@@C:/Users/ssHss/Desktop/ImageTemp/url.txt@@-input@@C:/Users/ssHss/Desktop/ImageTemp/datax.xml@@-cq@@title,h1[id=artibodyTitle]#date,span[id=pub_date]#nodes,div[id=artibody]";
		System.out.println(CrawlerCI(canshu.split("@@")));
	}

	/**
	 * Single-page extraction: fetches the URL given in {@code args[1]} and prints
	 * one {@code key:text} line per extraction rule.
	 *
	 * @author Garfield
	 * @param args {@code -cw}, the URL, then options; {@code -cq} is mandatory
	 * @return a status message; failures (wrong mode, invalid URL, malformed
	 *         option, network error) are reported through the message instead of
	 *         an exception
	 */
	public static String RunCralerCw(String[] args) {
		if (args == null || args.length < 2 || !args[0].trim().equals("-cw")) {
			return "Please enter the correct operation order! -cw";
		}
		String url = UrlFormat.URLINPUTFORMAT(args[1]);
		if (!StringUtils.IsUrl(url)) {
			// Previously the connection stayed null here and the method crashed later.
			return "[ERROR]Invalid URL: " + args[1];
		}
		// NOTE(review): second normalisation kept from the original code; it is
		// probably redundant - confirm against UrlFormat.
		url = UrlFormat.URLINPUTFORMAT(url);

		Connection conn;
		CrawlOptions opts;
		try {
			conn = Jsoup.connect(url);
			opts = applyOptions(conn, args, 2);
		} catch (IllegalArgumentException e) {
			return "[ERROR]" + e.getMessage();
		}
		if (StringUtils.IsBlank(opts.rules)) {
			return "[INFO]The CQ Parm is Fetcher!The -cq IS NULL Exection!";
		}
		if (!opts.headerSupplied) {
			conn.header("User-Agent", DEFAULT_USER_AGENT);
			System.out
					.println("[INFO]You cannot input UserAgent,Default Set Chrome Safari/537.36");
		}

		Document doc;
		try {
			doc = fetch(conn, opts.post);
		} catch (IOException e) {
			// Without a document there is nothing to select from.
			return "[ERROR]" + e.getMessage();
		}

		for (String rule : opts.rules.split("#")) {
			if (rule.trim().isEmpty()) {
				continue;
			}
			String[] kv = splitRule(rule);
			if (kv == null) {
				System.out.println("[ERROR]Malformed -cq rule (expected key,selector): " + rule);
				continue;
			}
			System.out.println(kv[0] + ":" + doc.select(kv[1]).text());
		}
		return "[SUCCESS]Fetcher over!";
	}

	/**
	 * CI crawler (URL list + structure): reads one URL per line from the
	 * {@code -file} path and, for each of them, writes a {@code <group>} element
	 * with a running {@code <id>} and the extracted fields to the {@code -input}
	 * path.
	 *
	 * @param args {@code -ci} followed by options; {@code -file} and
	 *             {@code -input} are mandatory
	 * @return a status message
	 */
	public static String CrawlerCI(String args[]) {
		if (args == null || args.length == 0 || !args[0].trim().equals("-ci")) {
			return "Please enter the correct operation order!";
		}
		CrawlOptions opts;
		try {
			// No connection yet: only the file locations are needed here.
			opts = applyOptions(null, args, 1);
		} catch (IllegalArgumentException e) {
			return "[ERROR]" + e.getMessage();
		}
		if (StringUtils.IsBlank(opts.urlListFile)) {
			return "filepath no such";
		}
		if (StringUtils.IsBlank(opts.outputFile)) {
			return "fileinput no such";
		}

		String fileinput = opts.outputFile;
		// try-with-resources closes the reader exactly once on every path.
		try (BufferedReader reader = new BufferedReader(new FileReader(new File(opts.urlListFile)))) {
			// NOTE(review): the groups below are written without a common root
			// element, so the file as a whole is not a well-formed XML document.
			CreateFile.makeTxt(fileinput,
					"<?xml version='1.0' encoding='UTF-8'?>");
			CreateFile.makeTxt(fileinput,
					"<!-- Crawler Tools See Garfields.cc -->");

			int id = 0;
			String line;
			while ((line = reader.readLine()) != null) {
				String url = line.trim();
				if (url.isEmpty()) {
					// Blank lines are not URLs; connecting to them would fail.
					continue;
				}
				id++;
				CreateFile.makeTxt(fileinput, "<group>");
				CreateFile.makeTxt(fileinput, "\t<id>" + id + "</id>");
				System.out.println("[INFO]CrawlerIng.....\t\tCount:" + id);
				try {
					CralwerCIRun(args, url);
				} finally {
					// Keep the output balanced even if one page fails unexpectedly.
					CreateFile.makeTxt(fileinput, "</group>");
				}
			}
			return "[SUCCESS]Crawler over！";
		} catch (IOException e) {
			return "[ERROR]" + e.getMessage();
		}
	}

	/**
	 * Fetches one page for the CI crawler and writes every extracted field as
	 * {@code <key>text</key>} to the {@code -input} file. Problems are printed to
	 * standard output as well as returned, because {@link #CrawlerCI(String[])}
	 * does not inspect the return value.
	 *
	 * @param args       the same argument array that was given to
	 *                   {@link #CrawlerCI(String[])}
	 * @param tempString the URL to fetch
	 * @return a status message
	 */
	public static String CralwerCIRun(String[] args, String tempString) {
		if (args == null || args.length == 0 || !args[0].trim().equals("-ci")) {
			return "Please enter the correct operation order! -ci";
		}

		Connection conn;
		CrawlOptions opts;
		try {
			// The connection is created once, before the options are applied, so
			// that every option ends up on the connection that is actually used.
			conn = Jsoup.connect(tempString);
			opts = applyOptions(conn, args, 1);
		} catch (IllegalArgumentException e) {
			return report("[ERROR]" + e.getMessage());
		}

		if (StringUtils.IsBlank(opts.rules)) {
			return report("[ERROR]The CQ Parm is Fetcher!The -cq IS NULL Exection!");
		}
		if (StringUtils.IsBlank(opts.urlListFile)) {
			// Only reported: the URL list is not needed to process a single URL.
			System.out
					.println("[ERROR]You must specify a URLLIST file path !");
		}
		if (StringUtils.IsBlank(opts.outputFile)) {
			return report("[ERROR]You must specify a fileinput file path !");
		}
		if (!opts.headerSupplied) {
			conn.header("User-Agent", DEFAULT_USER_AGENT);
		}

		Document doc;
		try {
			doc = fetch(conn, opts.post);
		} catch (IOException e) {
			return report("[ERROR]" + e.getMessage());
		}

		String fileinput = opts.outputFile;
		for (String rule : opts.rules.split("#")) {
			if (rule.trim().isEmpty()) {
				continue;
			}
			String[] kv = splitRule(rule);
			if (kv == null) {
				System.out.println("[ERROR]Malformed -cq rule (expected key,selector): " + rule);
				continue;
			}
			String data = doc.select(kv[1]).text();
			// Page text may contain '&' or '<', which must not reach the XML raw.
			CreateFile.makeTxt(fileinput, "\t<" + kv[0] + ">" + escapeXml(data) + "</"
					+ kv[0] + ">");
		}
		return "[SUCCESS]Crawler Over! Document Save in" + fileinput;
	}

	/**
	 * List crawler: fetches {@code url}, selects the elements matching
	 * {@code gz}, and for every {@code <a>} below them prints the link produced by
	 * {@code UrlFormat.Urlformat} and passes it to {@code CreateFile.makeTxt}.
	 *
	 * @param filepath file that receives the links
	 * @param url      page to fetch
	 * @param gz       CSS selector of the region that contains the links
	 * @return a status message containing the number of links, or the error text
	 */
	public static String CrawlerCL(String filepath, String url, String gz) {
		try {
			Connection conn = Jsoup.connect(url);
			conn.header("User-Agent", DEFAULT_USER_AGENT);
			conn.ignoreContentType(true);
			conn.timeout(LIST_TIMEOUT_MILLIS);
			Document doc = conn.get();

			Elements links = doc.select(gz).select("a");
			int count = 0;
			for (Element link : links) {
				count++;
				String target = UrlFormat.Urlformat(url, link.attr("href"));
				System.out.println(count + ":" + target);
				CreateFile.makeTxt(filepath, target);
			}
			return "[SUCCESS]爬虫完成爬取,总计URL条数" + count + "条!";
		} catch (IOException e) {
			return "[ERROR]" + e.getMessage();
		}
	}

	/**
	 * Walks {@code args} from {@code start}, records the option values and, when
	 * {@code conn} is not null, applies the request-related options to it.
	 *
	 * <p>Options that take a value consume two positions; {@code -post} and any
	 * unrecognised token consume one, so a value-less flag can no longer shift
	 * the options that follow it.
	 *
	 * @param conn  connection to configure, or null to only collect values
	 * @param args  full argument array
	 * @param start index of the first option
	 * @return the collected values
	 * @throws IllegalArgumentException if an option has no value or its value is
	 *                                  malformed
	 */
	private static CrawlOptions applyOptions(Connection conn, String[] args, int start) {
		CrawlOptions opts = new CrawlOptions();
		// Historic rule: a trailing "-post" always selects POST.
		opts.post = args.length > 0 && StringUtils.IsEqual(args[args.length - 1], "-post");

		int i = start;
		while (i < args.length) {
			String flag = args[i];
			if (StringUtils.IsEqual(flag, "-post")) {
				opts.post = true;
				i++;
				continue;
			}
			if (!isValueOption(flag)) {
				i++;
				continue;
			}
			if (i + 1 >= args.length) {
				throw new IllegalArgumentException("Missing value for option " + flag);
			}
			applyOption(conn, opts, flag, args[i + 1]);
			i += 2;
		}
		return opts;
	}

	/** Tells whether {@code flag} is one of the options that is followed by a value. */
	private static boolean isValueOption(String flag) {
		return StringUtils.IsEqual(flag, "-header")
				|| StringUtils.IsEqual(flag, "-cookie")
				|| StringUtils.IsEqual(flag, "-data")
				|| StringUtils.IsEqual(flag, "-proxy")
				|| StringUtils.IsEqual(flag, "-timesec")
				|| StringUtils.IsEqual(flag, "-cq")
				|| StringUtils.IsEqual(flag, "-file")
				|| StringUtils.IsEqual(flag, "-input");
	}

	/** Records or applies a single {@code flag value} pair; see {@link #applyOptions}. */
	private static void applyOption(Connection conn, CrawlOptions opts, String flag, String value) {
		if (StringUtils.IsEqual(flag, "-cq")) {
			opts.rules = value;
			return;
		}
		if (StringUtils.IsEqual(flag, "-file")) {
			opts.urlListFile = value;
			return;
		}
		if (StringUtils.IsEqual(flag, "-input")) {
			opts.outputFile = value;
			return;
		}
		if (conn == null) {
			// Collect-only pass: request options have nothing to act on.
			return;
		}

		if (StringUtils.IsEqual(flag, "-header")) {
			// Limit 2 keeps '@' characters that belong to the header value.
			String[] header = value.split("@", 2);
			if (header.length < 2) {
				throw new IllegalArgumentException("-header expects name@value but got: " + value);
			}
			conn.header(header[0], header[1]);
			opts.headerSupplied = true;
		} else if (StringUtils.IsEqual(flag, "-cookie")) {
			// NOTE(review): this stores one cookie literally named "Cookie";
			// confirm whether a raw Cookie header was intended instead.
			conn.cookie("Cookie", value);
		} else if (StringUtils.IsEqual(flag, "-data")) {
			applyFormData(conn, value);
			opts.post = true;
		} else if (StringUtils.IsEqual(flag, "-proxy")) {
			String[] hostAndPort = value.split(":");
			if (hostAndPort.length < 2) {
				throw new IllegalArgumentException("-proxy expects host:port but got: " + value);
			}
			int port = parseNumber(flag, hostAndPort[1]);
			conn.proxy(hostAndPort[0], port);
		} else if (StringUtils.IsEqual(flag, "-timesec")) {
			// Presumably a delay between requests - see TimeSec for the details.
			TimeSec.Timesec(parseNumber(flag, value));
		}
	}

	/**
	 * Adds the form fields encoded as {@code k=v&k2=v2} to the request. jsoup's
	 * varargs {@code data} method rejects an odd number of strings, so the raw
	 * value cannot be handed over as one argument.
	 */
	private static void applyFormData(Connection conn, String raw) {
		for (String pair : raw.split("&")) {
			if (pair.isEmpty()) {
				continue;
			}
			int eq = pair.indexOf('=');
			String key = eq < 0 ? pair : pair.substring(0, eq);
			String val = eq < 0 ? "" : pair.substring(eq + 1);
			conn.data(key, val);
		}
	}

	/** Parses the numeric value of {@code flag}, naming the option when it is not a number. */
	private static Integer parseNumber(String flag, String text) {
		try {
			return Integer.valueOf(text.trim());
		} catch (NumberFormatException e) {
			throw new IllegalArgumentException(flag + " expects a number but got: " + text, e);
		}
	}

	/**
	 * Sends the request with the shared timeout and content-type settings.
	 * The verb is chosen here because {@code get()} and {@code post()} each
	 * force their own method on the connection.
	 */
	private static Document fetch(Connection conn, boolean post) throws IOException {
		conn.timeout(PAGE_TIMEOUT_MILLIS);
		conn.ignoreContentType(true);
		return post ? conn.post() : conn.get();
	}

	/**
	 * Splits one rule at its first comma, so that commas inside the selector
	 * (selector groups) are preserved.
	 *
	 * @return {@code {key, selector}}, or null when either part is missing
	 */
	private static String[] splitRule(String rule) {
		String[] kv = rule.split(",", 2);
		if (kv.length < 2 || kv[0].trim().isEmpty() || kv[1].trim().isEmpty()) {
			return null;
		}
		return kv;
	}

	/** Escapes the characters that are not allowed verbatim in XML element text. */
	private static String escapeXml(String text) {
		return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
	}

	/** Prints {@code message} and returns it, for errors that must be both shown and returned. */
	private static String report(String message) {
		System.out.println(message);
		return message;
	}

}
